# Import required libraries
import pandas as pd
import numpy as np
from pyod.models.iforest import IForest
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw transaction records into a DataFrame
transactions = pd.read_csv("transactions.csv")

# Restrict the frame to the identifier plus the three numeric fields
columns = [
    "TransactionID",
    "TransactionAmount",
    "TransactionDuration",
    "AccountBalance",
]
transactions = transactions.loc[:, columns]

# Preview the first rows
transactions.head()

# Select numerical features for anomaly detection
features = transactions[["TransactionAmount", "TransactionDuration", "AccountBalance"]]

# Train an IForest model.
# The model is fitted and scored on a plain NumPy array: handing the DataFrame
# to decision_function() makes scikit-learn emit "X has feature names, but
# IsolationForest was fitted without feature names". The numeric values are
# the same either way, so the scores do not change.
model = IForest(n_estimators=100, contamination=0.05, random_state=42)
model.fit(features.to_numpy())

# Add the anomaly scores to the dataset
# (PyOD convention: larger scores indicate more abnormal samples)
transactions["Anomaly_Score"] = model.decision_function(features.to_numpy())

transactions.head()

C:\Users\newbe\anaconda3\Lib\site-packages\sklearn\utils\validation.py:2732: UserWarning: X has feature names, but IsolationForest was fitted without feature names
  warnings.warn(

# Flag transactions as anomalies based on the model's prediction.
# PyOD's predict() returns 1 for outliers and 0 for inliers, so the comparison
# with 1 yields an integer 0/1 flag per transaction. The features are passed as
# a NumPy array so scikit-learn does not warn that X has feature names while
# the underlying IsolationForest was fitted without them.
transactions["Anomaly"] = (model.predict(features.to_numpy()) == 1).astype(int)

transactions.head()

C:\Users\newbe\anaconda3\Lib\site-packages\sklearn\utils\validation.py:2732: UserWarning: X has feature names, but IsolationForest was fitted without feature names
  warnings.warn(

# Collect the identifier and numeric fields of every flagged transaction
anomalies_summary = transactions.loc[
    transactions["Anomaly"].eq(1),
    [
        "TransactionID",
        "TransactionAmount",
        "TransactionDuration",
        "AccountBalance",
    ],
]

# Preview the first flagged rows
anomalies_summary.head()

# Plot the distribution of TransactionAmount for normal and anomalous transactions.
# "Anomaly" holds integer 0/1 flags, so each subset is selected by comparing
# against 0 or 1 (not against booleans) in a single .loc lookup.
# NOTE(review): each hist() call computes its own 30 bins over its own subset,
# so the two overlaid histograms do not share bin edges.
plt.figure(figsize=(8, 6))
transactions.loc[transactions["Anomaly"] == 0, "TransactionAmount"].hist(
    bins=30, alpha=0.5, label="Normal", color="blue"
)
transactions.loc[transactions["Anomaly"] == 1, "TransactionAmount"].hist(
    bins=30, alpha=0.5, label="Anomalous", color="red"
)
plt.title("Transaction Amount Distribution")
plt.xlabel("Transaction Amount")
plt.ylabel("Frequency")
plt.legend()
plt.savefig("anomalies_histogram.png")

Column	Description
`TransactionID`	A unique identifier for each transaction.
`TransactionAmount`	The amount of money involved in the transaction (in USD).
`TransactionDuration`	Duration of the transaction (in seconds).
`AccountBalance`	The balance of the account after the transaction was processed (in USD).

	TransactionID	TransactionAmount	TransactionDuration	AccountBalance
0	TX000001	14.09	81	5112.21
1	TX000002	376.24	141	13758.91
2	TX000003	126.29	56	1122.35
3	TX000004	184.50	25	8569.06
4	TX000005	13.45	198	7429.40

	TransactionID	TransactionAmount	TransactionDuration	AccountBalance	Anomaly_Score
0	TX000001	14.09	81	5112.21	-0.127230
1	TX000002	376.24	141	13758.91	-0.043850
2	TX000003	126.29	56	1122.35	-0.150263
3	TX000004	184.50	25	8569.06	-0.105860
4	TX000005	13.45	198	7429.40	-0.096070

	TransactionID	TransactionAmount	TransactionDuration	AccountBalance	Anomaly_Score
0	TX000001	14.09	81	5112.21	-0.127230
1	TX000002	376.24	141	13758.91	-0.043850
2	TX000003	126.29	56	1122.35	-0.150263
3	TX000004	184.50	25	8569.06	-0.105860
4	TX000005	13.45	198	7429.40	-0.096070

	TransactionID	TransactionAmount	TransactionDuration	AccountBalance
41	TX000042	34.02	19	14214.48
74	TX000075	1212.51	24	605.95
85	TX000086	1340.19	30	8654.28
141	TX000142	1049.92	21	2037.85
146	TX000147	973.39	296	2042.22

Project Description

The Data

Compute an anomaly score for each transaction.

Which transactions are flagged as anomalies?

Create a summary of anomalous transactions.

What is the distribution of TransactionAmount for normal and anomalous transactions?